import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from scipy.optimize import minimize

data = sio.loadmat('ex4data1.mat')
raw_X = data['X']
raw_y = data['y']

X = np.insert(raw_X, 0, values=1, axis=1)
print(X.shape)  # (5000, 401)
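# Each row of raw_X is a flattened 20x20 grayscale digit image (400 pixels,
# as the reshape(20, 20) in the visualization below confirms); the inserted
# column of ones is the bias feature.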


# 1. One-hot encode the labels y
def one_hot_encoder(raw_y):
    result = []

    for i in raw_y.flatten():  # labels run from 1 to 10
        y_temp = np.zeros(10)
        y_temp[i - 1] = 1  # set the position for this label
        result.append(y_temp)

    return np.array(result)


y = one_hot_encoder(raw_y)
print(y)
# [[0. 0. 0. ... 0. 0. 1.]
#  [0. 0. 0. ... 0. 0. 1.]
#  [0. 0. 0. ... 0. 0. 1.]
#  ...
#  [0. 0. 0. ... 0. 1. 0.]
#  [0. 0. 0. ... 0. 1. 0.]
#  [0. 0. 0. ... 0. 1. 0.]]
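

# An equivalent vectorized encoding (a quick sketch): np.eye(10) gives the
# identity matrix, and the 0-based labels select its rows.
assert np.array_equal(y, np.eye(10)[raw_y.flatten() - 1])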


theta = sio.loadmat('ex4weights.mat')
theta1, theta2 = theta['Theta1'], theta['Theta2']
print(theta1.shape)  # (25, 401)
print(theta2.shape)  # (10, 26)


# 2. Serialize the weight parameters into one flat vector
def serialize(a, b):
    return np.append(a.flatten(), b.flatten())


theta_serialize = serialize(theta1, theta2)
print(theta_serialize.shape)  # (10285,)


# 3. Deserialize the flat vector back into the weight matrices
def deserialize(theta_serialize):
    theta1 = theta_serialize[:25*401].reshape(25, 401)
    theta2 = theta_serialize[25*401:].reshape(10, 26)
    return theta1, theta2


theta1, theta2 = deserialize(theta_serialize)
print(theta1.shape)  # (25, 401)
print(theta2.shape)  # (10, 26)
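
# Sanity check (a quick sketch): serialize and deserialize should be
# exact inverses of each other.
assert np.array_equal(serialize(theta1, theta2), theta_serialize)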


# 4. Forward propagation
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def feed_forward(theta_serialize, X):
    theta1, theta2 = deserialize(theta_serialize)
    a1 = X                                   # input layer with bias column, (5000, 401)
    z2 = a1 @ theta1.T                       # (5000, 25)
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, values=1, axis=1)  # add the hidden-layer bias unit, (5000, 26)
    z3 = a2 @ theta2.T                       # (5000, 10)
    h = sigmoid(z3)                          # output-layer activations
    return a1, z2, a2, z3, h
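

# Quick shape check (a sketch): the hypothesis h should hold one probability
# per class for each of the 5000 examples.
_, _, _, _, h_check = feed_forward(theta_serialize, X)
print(h_check.shape)  # (5000, 10)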


# 5. Cost function
# 5.1 Cost without regularization
def cost(theta_serialize, X, y):
    a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
    J = -np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) / len(X)
    return J


print(cost(theta_serialize, X, y))
# 0.2876291651613189


# 5.2 Cost with regularization
def reg_cost(theta_serialize, X, y, lamda):
    theta1, theta2 = deserialize(theta_serialize)  # use the passed-in weights, not globals
    sum1 = np.sum(np.power(theta1[:, 1:], 2))  # skip the bias column
    sum2 = np.sum(np.power(theta2[:, 1:], 2))
    reg = (sum1 + sum2) * lamda / (2 * len(X))
    return reg + cost(theta_serialize, X, y)


lamda = 1
print(reg_cost(theta_serialize, X, y, lamda))


# 6. Backpropagation
# 6-1 Gradient without regularization
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))
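

# Spot check (a sketch): the sigmoid derivative peaks at z = 0 with value 0.25.
print(sigmoid_gradient(0))  # 0.25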


def gradient(theta_serialize, X, y):
    theta1, theta2 = deserialize(theta_serialize)
    a1, z2, a2, z3, h = feed_forward(theta_serialize, X)
    d3 = h - y                                      # output-layer error, (5000, 10)
    d2 = d3 @ theta2[:, 1:] * sigmoid_gradient(z2)  # hidden-layer error, (5000, 25)
    D2 = (d3.T @ a2) / len(X)                       # (10, 26)
    D1 = (d2.T @ a1) / len(X)                       # (25, 401)
    return serialize(D1, D2)


# 6-2 Gradient with regularization
def reg_gradient(theta_serialize, X, y, lamda):
    D = gradient(theta_serialize, X, y)
    D1, D2 = deserialize(D)

    theta1, theta2 = deserialize(theta_serialize)
    D1[:, 1:] = D1[:, 1:] + theta1[:, 1:] * lamda / len(X)  # bias columns are not regularized
    D2[:, 1:] = D2[:, 1:] + theta2[:, 1:] * lamda / len(X)

    return serialize(D1, D2)
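

# Optional numerical gradient check (a minimal sketch; gradient_check is a
# hypothetical helper, not part of the original code). It compares the
# analytic gradient against a centered finite difference on a few randomly
# chosen parameters; the two should agree to several decimal places.
def gradient_check(theta_serialize, X, y, lamda, eps=1e-4, n_checks=5):
    analytic = reg_gradient(theta_serialize, X, y, lamda)
    rng = np.random.default_rng(0)
    for idx in rng.choice(len(theta_serialize), size=n_checks, replace=False):
        plus, minus = theta_serialize.copy(), theta_serialize.copy()
        plus[idx] += eps
        minus[idx] -= eps
        numeric = (reg_cost(plus, X, y, lamda) - reg_cost(minus, X, y, lamda)) / (2 * eps)
        print(idx, analytic[idx], numeric)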


# 7. Train the neural network
def nn_train(X, y, lamda):
    init_theta = np.random.uniform(-0.5, 0.5, 10285)  # random initialization breaks symmetry
    res = minimize(fun=reg_cost, x0=init_theta, args=(X, y, lamda), method='TNC',
                   jac=reg_gradient, options={'maxiter': 300})
    return res


lamda = 10
res = nn_train(X, y, lamda)
print(res)

raw_y = data['y'].reshape(5000,)  # flatten labels to 1-D for comparison with predictions


_, _, _, _, h = feed_forward(res.x, X)
y_pred = np.argmax(h, axis=1) + 1  # argmax is 0-based; labels are 1-based
acc = np.mean(y_pred == raw_y)

print(acc)
# 0.9994
# after optimization: 0.9416
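

# Per-class accuracy breakdown (a sketch; in this dataset the label 10
# stands for the digit 0).
for k in range(1, 11):
    mask = raw_y == k
    print(k, np.mean(y_pred[mask] == raw_y[mask]))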


# 8. Visualize the hidden layer
def plot_hidden_layer(theta):
    theta1, _ = deserialize(theta)
    hidden_layer = theta1[:, 1:]  # (25, 400), one row of weights per hidden unit

    fig, ax = plt.subplots(ncols=5, nrows=5, figsize=(8, 8), sharex=True, sharey=True)  # 5x5 grid, one image per hidden unit

    for r in range(5):
        for c in range(5):
            ax[r, c].imshow(hidden_layer[5 * r + c].reshape(20, 20).T, cmap='gray_r')

    plt.xticks([])  # remove tick marks (axes are shared, so this clears every subplot)
    plt.yticks([])

    plt.show()

plot_hidden_layer(res.x)
